
# R script for conducting additional analysis beyond the main results
# This file replicates key statistics and Figure 1 in the main text and Tables A12-18 in the appendix

rm(list = ls())

if (!require('tidyverse')) install.packages('tidyverse')
if (!require('magrittr')) install.packages('magrittr')
if (!require('broom')) install.packages('broom')
if (!require('cld2')) install.packages('cld2')
if (!require('kableExtra')) install.packages('kableExtra')

export_tables <- TRUE
export_figures <- TRUE

if (export_tables & !dir.exists('Tables/')) dir.create('Tables/')
if (export_figures & !dir.exists('Figures/')) dir.create('Figures/')

# Load data ---------------------------------------------------------------

# Make sure all the referenced datasets are placed in the same (sub)dir as this script

dat_raw <- readRDS("Dataset_raw.RDS") # raw complete dataset

dat_all = readRDS("Dataset_cleaned.RDS") # prepared subsets
dat_pub = dat_all$`publication-level`
dat_falc = dat_all$`faculty-level`
dat_dept = dat_all$`department-level`
rm(dat_all)

# Figure 1: coverage map --------------------------------------------------

if (!require("rnaturalearth")) install.packages("rnaturalearth")
if (!require("rnaturalearthdata")) install.packages("rnaturalearthdata")
if (!require("ggthemes")) install.packages("ggthemes")

n_schools <- dat_raw |>
  dplyr::select(region, country, university) |>
  dplyr::distinct() |>
  dplyr::group_by(region, country) |>
  dplyr::summarise(n_school = n()) |>
  dplyr::ungroup()

world_map <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")

n_schools$country[which(is.na(match(n_schools$country, world_map$name_en)))] # a few unmatched country names
world_map$name_en2 = world_map$name_en
world_map$name_en2[world_map$name_en=="People's Republic of China"] <- "China"
world_map$name_en2[world_map$name_en=="United Arab Emirates"] <- "UAE"
world_map$name_en2[world_map$name_en=="Czech Republic"] <- "Czechia"
world_map$name_en2[world_map$name_en=="United States of America"] <- "United States"
sum(is.na(match(n_schools$country, world_map$name_en2))) == 0 # OK now

world_schools <- world_map |>
  dplyr::filter(name != "Antarctica") |>
  dplyr::left_join(n_schools, by = c("name_en2" = "country")) |>
  dplyr::mutate(is_covered = ifelse(is.na(n_school) | n_school==0, 0, 1)) 

ggplot(data = world_schools) +
  geom_sf(aes(fill = factor(is_covered))) +
  scale_fill_manual(breaks = c(1,0), 
                    values = c("#003366","white"), 
                    labels = c("Yes", "No"),
                    name = "Coverage"
  ) +
  ggthemes::theme_map() +
  theme(
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 12),
    legend.key.size = unit(1, "cm") 
  ) 

if (export_figures) { # save map to a tiff file
  ggsave("Figures/Fig1_map_coverage.tiff", plot = last_plot(), width = 12, height = 6, dpi = 300, bg = "white", device = "tiff")
}

# Stat: total num. of countries and departments ---------------------------

print(length(unique(dat_dept$country))) # Num. of countries = 45
print(nrow(dat_dept)) # Num, of schools = 178

# Stat: total num. of faculty ---------------------------------------------

print(nrow(dat_falc)) # Num, of faculty = 5586

# Stat: total num. of publications ----------------------------------------

dat_npubs <- dat_raw |>
  dplyr::filter(title %in% dat_pub$publication) |>
  dplyr::select(title, publisher, journal) |>
  dplyr::distinct() |> # 128,123 unique publications
  dplyr::mutate(book = ifelse(!is.na(publisher) & publisher!=" "& publisher!="", 1, 0)) |>
  dplyr::summarise(n_book = sum(book), n_journal = sum(1-book))

print(dat_npubs)
# n_book n_journal
# 12696    115427

# Stat: language coverage -------------------------------------------------

lang_pub <- cld2::detect_language(dat_pub$publication, plain_text = FALSE, lang_code = FALSE)
df_lang_pub <- cbind.data.frame(publication = dat_pub$publication, language = lang_pub)
n_lang_pub <- df_lang_pub |>
  dplyr::filter(!is.na(language)) |>
  dplyr::group_by(language) |>
  dplyr::summarise(n_pub = n()) |>
  dplyr::ungroup() |>
  dplyr::arrange(desc(n_pub)) 

n_lang_pub_top5 <- n_lang_pub[1:5,] |>
  dplyr::mutate(pct_pub = n_pub/length(dat_pub$publication))
print(n_lang_pub_top5)
# 1 ENGLISH  99529  0.774 
# 2 SPANISH   4498  0.0350
# 3 RUSSIAN   3920  0.0305
# 4 CHINESE   3573  0.0278
# 5 FRENCH    3254  0.0253
sum(n_lang_pub_top5$pct_pub) # 0.8928285

n_lang_pub |>
  dplyr::mutate(pct_pub = n_pub/length(dat_pub$publication)) |>
  dplyr::filter(pct_pub>=0.01)
# 1 ENGLISH    99529  0.774 
# 2 SPANISH     4498  0.0350
# 3 RUSSIAN     3920  0.0305
# 4 CHINESE     3573  0.0278
# 5 FRENCH      3254  0.0253
# 6 PORTUGUESE  2180  0.0170
# 7 JAPANESE    1668  0.0130

# n_lang_pub |>
#   dplyr::mutate(pct_pub = n_pub/length(dat_pub$publication)) |>
#   dplyr::slice_head(n=10)

print(sum(n_lang_pub$n_pub/length(dat_pub$publication))) # 97.5%
# sum(!is.na(df_lang_pub$language))/nrow(df_lang_pub) # 97.5%
print(sum(is.na(df_lang_pub$language))/length(dat_pub$publication)) # 2.5%

# Tables A12-A17: correlations in scores ----------------------------------

## define correlation table helpers ----

tabulate_correlations <- function(
    df = dat_dept, var1 = "citations", var2 = "citations_recent", 
    table_caption="Correlation between total and recent citations (2018-2022).", table_label="corr_cites") {
  corr_regions <- df |> 
    dplyr::filter(!region=="Africa") %>%  # Africa has only two schools
    split(.$region) |>
    purrr::map(function(df) cor.test(df[[var1]], df[[var2]])) |>
    purrr::map(broom::tidy) |>
    dplyr::bind_rows(.id = "region")
  corr_global <- df %$%
    cor.test(.[[var1]], .[[var2]]) |>
    broom::tidy() |>
    dplyr::mutate(region="Global", .before = 1L)
  rbind.data.frame(corr_regions, corr_global) %>% 
    dplyr::left_join(
      data.frame(region=c("Global","North America", "Europe", "Asia", "Oceania", "Latin America")),
      ., by = "region"
    ) |>
    dplyr::select(region, estimate, statistic, `p.value`, `conf.low`, `conf.high`) |>
    dplyr::mutate(conf = paste0("[", round(`conf.low`, 3), ", ",round(`conf.high`, 3), "]" )) |>
    dplyr::select(-`conf.low`, -`conf.high`) |>
    kbl(
      col.names = c(" ", "Pearson's $\\rho$", "$t$-statistic", "$p$-value", "95\\% CI"),
      escape = FALSE, linesep = "",
      digits = 3, booktabs = TRUE, 
      format = "latex", label = table_label,
      caption = table_caption) |>
    footnote(general = c("Africa has too few schools in the sample (N=2) to permit a separate correlation analysis."), 
             general_title = "Note:",
             footnote_as_chunk = T)
}

## Table A12: corr(citations, recent citations) ----

TabA12 <- tabulate_correlations()
if (export_tables) writeLines(TabA12, 'Tables/TabA12.tex')

## Table A13: corr(total citations, average citations) ----

TabA13 <- tabulate_correlations(
  var1 = "citations", var2 = "citations_pf",
  table_caption = "Correlation between total and per-faculty citations.", 
  table_label = "corr_cites_pf")
if (export_tables) writeLines(TabA13, 'Tables/TabA13.tex')

## Table A14: corr(impact, recent impact) ----

TabA14 <- tabulate_correlations(
  var1 = "impact", var2 = "impact_recent",
  table_caption="Correlation between total and recent impacts (2018-2022).", 
  table_label="corr_impact"
  )
if (export_tables) writeLines(TabA14, 'Tables/TabA14.tex')

## Table A15: corr(total impact, average impact) ----

TabA15 <- tabulate_correlations(
  var1 = "impact", var2 = "impact_pf",
  table_caption = "Correlation between total and per-faculty impacts.", 
  table_label = "corr_impact_pf"
  )
if (export_tables) writeLines(TabA15, 'Tables/TabA15.tex')

## Table A16: corr(top publications, recent top publications) ----

TabA16 <- tabulate_correlations(
  var1 = "pub_top", var2 = "pub_top_recent",
  table_caption = "Correlation between total and recent top publications (2018-2022).", 
  table_label = "corr_toppubs"
  )
if (export_tables) writeLines(TabA16, 'Tables/TabA16.tex')

## Table A16: corr(total top publications, average top publications) ----

TabA17 <- tabulate_correlations(
  var1 = "pub_top", var2 = "pub_top_pf",
  table_caption = "Correlation between total and per-faculty top publications.", 
  table_label = "corr_toppubs_pf"
  )
if (export_tables) writeLines(TabA17, 'Tables/TabA17.tex')

# END #
